home *** CD-ROM | disk | FTP | other *** search
- /* SGML_stream.c
- * $Id: SGMLstream.c,v 1.3 93/01/06 18:40:28 connolly Exp Locker: connolly $
- */
-
- /* implements... */
- #include "SGML.h"
-
- /* uses ... */
- #include "object.h"
- #include <ctype.h>
- #include <assert.h>
- #include <string.h>
-
-
- VOID
- SGML_parseInstance(stream, getch, document, docclass)
- HMStream stream;
- HMGetcProc *getch;
- HMDoc* document;
- CONST HMDoc_Class *docclass;
- {
- static char RE[] = "\n";
- char REbuffer[1 + SGML_LITLEN + SGML_NAMELEN + 4];
- char *buffer = REbuffer + 1;
- int content = SGML_MIXED;
- int lookahead = EOF;
- int len, read;
- char gi[SGML_NAMELEN+1];
- HMBinding attributes[SGML_ATTCNT];
- int attrqty;
- char eat_next_RE = 1, RE_pending = 0;
-
- REbuffer[0] = '\n'; /*@@ should be 13, not 10! */
-
- while( (read = SGML_read(stream, getch, buffer, sizeof(REbuffer) - 2,
- content, &lookahead)) != EOF){
- switch(read){
- case SGML_start_tag:
- if (RE_pending){
- (docclass->data)(document, RE, 1);
- }
-
- len = SGML_read_name(stream, getch, gi, &lookahead);
- gi[len] = 0;
-
- attrqty = 0;
- while(isalpha(lookahead)){ /* iterate over attributes */
- len = SGML_read_name(stream, getch,
- buffer, &lookahead);
- buffer[len] = 0;
-
- if(lookahead == '='){
- int offset = len + 1;
- HMBinding* attr = &attributes[attrqty++];
-
- lookahead = EOF;
- /* @@ entity references in attribute value */
- len += SGML_read_value(stream,
- getch,
- buffer + offset,
- &lookahead) + 1;
- buffer[len++] = '\0';
- attr->name = NEW(char, len);
- memcpy(attr->name, buffer, len);
- attr->value = attr->name + offset;
- }
- }
-
- /* look for tag close */
- while(isspace(lookahead))
- lookahead = (getch)(stream);
- lookahead = EOF;
-
- {
- int i;
- int c;
-
- c = (docclass->startTag)(document, gi, attributes, attrqty);
-
- if(c == SGML_EMPTY){
- eat_next_RE = 0;
- }else{
- content = c;
- eat_next_RE = 1;
- }
-
- for(i=0; i<attrqty; i++)
- FREE(attributes[i].name);
- }
- RE_pending = 0;
- break;
-
- case SGML_end_tag:
- /* drop pending RE */
-
- len = SGML_read_name(stream, getch, gi, &lookahead);
- gi[len] = 0;
-
- /* look for tag close */
- while(isspace(lookahead))
- lookahead = (getch)(stream);
- lookahead = EOF;
-
- (docclass->endTag)(document, gi);
- content = SGML_MIXED; /* @@ could be element */
- eat_next_RE = 0;
- RE_pending = 0;
- break;
-
- case SGML_entity:
- if (RE_pending){
- (docclass->data)(document, RE, 1);
- }
- eat_next_RE = 0;
- RE_pending = 0;
-
- {
- CONST char* text = (docclass->entityText)(document, buffer);
-
- if(text)
- (docclass->data)(document, text, strlen(text));
- }
- break;
-
- case SGML_record_end:
- if(eat_next_RE){
- eat_next_RE = 0;
- RE_pending = 0;
- }
- else if (RE_pending){
- (docclass->data)(document, RE, 1);
- }
- else
- RE_pending = 1;
-
- break;
-
- default:
- buffer[read] = 0;
- if(RE_pending)
- (docclass->data)(document, REbuffer, read + 1);
- else
- (docclass->data)(document, buffer, read);
- RE_pending = 0;
- eat_next_RE = 0;
- break;
- }
- }while(read != EOF);
- }
-
-
- /*****
- * lexical analysis
- *****/
-
- int
- SGML_read(stream, getch,
- buf, nbytes,
- content,
- inout_lookahead)
- HMStream stream;
- HMGetcProc* getch;
- char* buf;
- int nbytes;
- int content;
- int* inout_lookahead;
- {
- int c; /* state machine input character */
- enum { /* state machine states */
- start, data, cdata, rcdata, pcdata,
- and, and_hash, cref, entity,
- lt, lt_slash, tag,
- pi,
- lt_bang, lt_bang_dash,
- comment, comment_dash, ps
- } state = start;
- /* auxiliary state: */
- int end_tag; /* saw '/' after '<' */
- char name[SGML_NAMELEN + 1]; /* function character name */
- int name_chars;
-
- int ret = 0; /* number of characters read */
-
- #define LOOKAHEAD(n) (ret + n < nbytes)
- #define REDUCE(s) { state = (s); break; }
- #define SHIFT(s) { state = (s); continue; }
- #define DONE(c) { *inout_lookahead = (c); return ret; }
- #define WRITE(c) { *buf++ = (c); ret++; }
-
- /* prime the pump */
- if((c = *inout_lookahead) == EOF)
- c = (getch)(stream);
-
- /* state machine...*/
- while(ret < nbytes){
-
- switch(state){
-
- case start:
- if(c == EOF) return EOF;
- else if(c == '\n') { ret = SGML_record_end; DONE(EOF); }
- else if(c == '<'){
- if(LOOKAHEAD(3)) { REDUCE(lt); }
- else { DONE(c); } /* no room for lookahead */
- }else if(c == '&'){
- if(LOOKAHEAD(2)) { REDUCE(and); }
- }else if(content == SGML_ELEMENT && isspace(c)){
- break; /* ignore whitespace in ELEMENT content */
- }else { SHIFT(data); }
-
- case data:
- if(content == SGML_ELEMENT){
- if(isspace(c)){
- break;
- }else{
- *buf = 0; ret = 0; DONE(c);
- }
- }else if(content == SGML_CDATA){ SHIFT(cdata); }
- else if(content == SGML_RCDATA){ SHIFT(rcdata); }
- else /* assume SGML_MIXED */ { SHIFT(pcdata); }
-
- case cdata:
- if(c == EOF || c == '<' || c == '\n') { DONE(c); }
- else{ WRITE(c); break; }
-
- case rcdata:
- case pcdata:
- if(c == EOF || c == '<' || c == '&' || c == '\n') { DONE(c); }
- else{ WRITE(c); break; }
-
- case and:
- if(c == '#') { REDUCE(and_hash); }
- else if(isalpha(c)) {
- if(LOOKAHEAD(SGML_NAMELEN+1)){
- name_chars = 0; SHIFT(entity);
- }else{
- DONE(c); /* error: no room for entity name */
- }
- }
- else{ WRITE('&'); SHIFT(data); }
-
- case entity:
- if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
- WRITE(c);
- break;
- }
- else{
- WRITE('\0');
- ret = SGML_entity;
- if(c == ';' || c == '\n'){ DONE(EOF); /* eat ; */ }
- else{ DONE(c); /* ended char ref with other char */ }
- }
-
- case and_hash:
- if(isalnum(c)){ name_chars = 0; SHIFT(cref); }
- else{ WRITE('&'); WRITE('#'); SHIFT(data); }
-
- case cref:
- /* auxiliary state: name_chars */
- if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
- if(name_chars < SGML_NAMELEN)
- name[name_chars++] = c;
- /* else markup error: name too long */
- break;
- }
- else{
- int nc = 0;
-
- name[name_chars] = '\0';
- if(isdigit(name[0])){
- nc = atoi(name);
- }else if(!strcmp(name, "SPACE")){
- nc = 32;
- }else if(!strcmp(name, "RS")){
- nc = 10;
- }else if(!strcmp(name, "RE")){
- nc = 13;
- }
-
- if(nc) WRITE(nc); /* else error: bad character reference */
-
- if(c == ';') { REDUCE(data); }
- else
- /* terminate entity reference w/space or something */
- { SHIFT(data); }
- }
-
- case lt:
- if(c == '/') { REDUCE(lt_slash); }
- if(content == SGML_MIXED || content == SGML_ELEMENT){
- if(c == '?') { REDUCE(pi); }
- else if(c == '!') { REDUCE(lt_bang); }
- else if(isalpha(c)) { end_tag = 0; SHIFT(tag); }
- }
- WRITE('<'); SHIFT(data);
-
- case lt_slash:
- if(isalpha(c)) { end_tag = 1; SHIFT(tag); }
- else { WRITE('<'); WRITE('/'); SHIFT(data); }
-
- case tag:
- /* auxiliary state: end_tag */
- ret = end_tag ? SGML_end_tag : SGML_start_tag;
- DONE(c);
-
- case pi: /* processing instruction (or markup declaraion) */
- if(c == '>') { REDUCE(start); }
- else if(c == EOF) { SHIFT(start); } /* error: EOF in pic */
- else break;
-
- case lt_bang:
- if(c == '-') { REDUCE(lt_bang_dash); }
- /*
- * *** NON CONFORMING IMPLEMENTATION ***
- * a letter here starts a markup declaration, which isn't supported
- * a [ starts a marked section, which isn't supported.
- * treat them like processing instructions.
- */
- else if(c == '[' || isalpha(c)) { REDUCE(pi); }
- else if(c == '>') { REDUCE(start); }
- else{ WRITE('<'); WRITE('!'); SHIFT(data); }
-
- case lt_bang_dash:
- if(c == '-') { REDUCE(comment); }
- else{ WRITE('<'); WRITE('!'); WRITE('-'); SHIFT(data); }
-
- case comment:
- if(c == '-') { REDUCE(comment_dash); }
- else if(c == EOF) { DONE(c); } /* error: eof in comment */
- else break;
-
- case comment_dash:
- if(c == '-') { REDUCE(ps); }
- else if(c == EOF) { DONE(c); }/* error: eof in comment */
- else { REDUCE(comment); }
-
- case ps: /* parameter separator between -- and > */
- if(c == EOF) { DONE(c); }
- else if(isspace(c)) break;
- else { REDUCE(start); }/* error if c !='>' */
-
- }
- c = (getch)(stream);
- }
-
- DONE(c); /* set up lookahead for next call */
- #undef S
- #undef LOOKAHEAD
- #undef REDUCE
- #undef SHIFT
- #undef DONE
- #undef WRITE
- }
-
-
- int
- SGML_read_name(stream, getch, buf, inout_lookahead)
- HMStream stream;
- HMGetcProc* getch;
- char* buf;
- int* inout_lookahead;
- {
- int name_chars = 0;
- int c = *inout_lookahead;
-
- if(!isalpha(c)) return 0;
-
- do{
- if(name_chars <= SGML_NAMELEN)
- buf[name_chars++] = toupper(c);
- /* else error: name too long */
- c = (getch)(stream);
- }while(isalnum(c) || strchr(SGML_UCNMCHAR, c));
-
- while(isspace(c))
- c = (getch)(stream);
-
- *inout_lookahead = c;
- return name_chars;
- }
-
-
- int
- SGML_read_value (stream,
- getch,
- buf,
- inout_lookahead)
- HMStream stream;
- HMGetcProc* getch;
- char* buf;
- int* inout_lookahead;
- {
-
- int c; /* state machine input character */
- enum { /* state machine states */
- start,
- literal,
- and, and_hash, cref,
- #if defined(SGML_SHORTTAG) || defined(GROK_UNQUOTED_LITERALS)
- value,
- #endif
- ps
- } state = start;
- /* auxiliary state: */
- char quote; /* which kind of quote */
-
- int ret = 0; /* number of characters read */
- char name[SGML_NAMELEN + 1]; /* entity name */
- int name_chars;
-
- #define LOOKAHEAD(n) (ret + n < SGML_LITLEN)
- #define REDUCE(s) { state = (s); break; }
- #define SHIFT(s) { state = (s); continue; }
- #define DONE(c) { *inout_lookahead = (c); return ret; }
- #define WRITE(c) { *buf++ = (c); ret++; }
-
- /* prime the pump */
- if((c = *inout_lookahead) == EOF)
- c = (getch)(stream);
-
- /* state machine...*/
- while(ret < SGML_LITLEN){
-
- switch(state){
-
- case start:
- if(c == EOF) return EOF;
- else if(c == '"') { quote = c; REDUCE(literal); }
- else if(c == '\'') { quote = c; REDUCE(literal); }
- else if(isspace(c)) break;
- #ifdef GROK_UNQUOTED_LITERALS
- else if(!(c == '>')){
- SHIFT(value);
- }
- #else
- #ifdef SGML_SHORTTAG
- else if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
- SHIFT(value);
- }
- #else
- else { DONE(c); } /* error: illegal char in markup */
- #endif
- #endif
-
- #ifdef GROK_UNQUOTED_LITERALS
- case value:
- if(c == EOF) { DONE(c); }
- else if(isspace(c) || c == '>'){ SHIFT(ps); }
- else{
- WRITE(c);
- break;
- }
- #else
- #ifdef SGML_SHORTTAG
- case value:
- if(c == EOF) { DONE(c); }
- else if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
- WRITE(c);
- break;
- }else{ SHIFT(ps); }
- #endif
- #endif
-
- case literal:
- if(c == EOF) { DONE(c); }
- else if(c == quote) { REDUCE(ps); }
- else if(c == '&'){ REDUCE(and); }
- else if(c == '\n' || c == '\t'){ WRITE(' '); break; }
- else{
- WRITE(c);
- break;
- }
-
- case and:
- if(c == '#') { REDUCE(and_hash); }
- /*@@ else if(isalpha(c)) ... process entity reference */
- else{ WRITE('&'); SHIFT(literal); }
-
- case and_hash:
- if(isalnum(c)){ name_chars = 0; SHIFT(cref); }
- else{ WRITE('&'); WRITE('#'); SHIFT(literal); }
-
- case cref:
- /*@@ in case of
xyz, this throws out xyz as error, when
- it should only throw out x */
- if(isdigit(c) || isalpha(c)
- || strchr(SGML_UCNMCHAR, c)){
- if(name_chars < SGML_NAMELEN)
- name[name_chars++] = c;
- /* else markup error: name too long */
- break;
- }
- else{
- int nc = 0;
-
- name[name_chars] = '\0';
- if(isdigit(name[0])){
- nc = atoi(name);
- }else if(!strcmp(name, "SPACE")){
- nc = 32;
- }else if(!strcmp(name, "RS")){
- nc = 10;
- }else if(!strcmp(name, "RE")){
- nc = 13;
- }else
- break;
-
- if(nc) WRITE(nc); /* else error: bad character reference */
-
- if(c == ';') { REDUCE(literal); }
- else
- /* terminate entity reference w/space or something */
- { SHIFT(literal); }
- }
-
- case ps: /* parameter separator between attributes */
- if(isspace(c)) break;
- else { DONE(c); }
-
- }
- c = (getch)(stream);
- }
-
- /* error: attribute value too long */
-
- DONE(EOF); /* set lookahead to EOF for next call */
- #undef S
- #undef LOOKAHEAD
- #undef REDUCE
- #undef SHIFT
- #undef DONE
- #undef WRITE
- }
-